title: “Case Study 2” author: “Fabio” date: “8/2/2019” output: html_document: toc: TRUE toc_float: collapsed: true smooth_scroll: false theme: paper df_print: paged


Executive Summary

  • We loaded the data

  • We performed an extensive exploratory analysis of the different variables in the data set in order to predict attrition and monthly income.

  • We finally after a lot of trial and error choose the Naive bayes model to predict attrition with accuracy od 0,7373, Sensitivity 0.7473, Specificity 0.6857.

  • The variables that we used for our Naive bayes model were the following : StockOptionLevel, JobLevel, MonthlyIncome, OverTime.

  • The 3 most important variables that we found in this dataset which are related to attrition are the following : MonthlyIncome, OverTime, StockOptionLevel.

  • Also the longer that the employee spend in the company or with the current manager, or the longer that the employee are working in general it is less likely to leave their job. Other trend, single employee tend to have more attrition, the further that an employee live from work the more likely of attrition.

  • We choose the regression model to predict monthly income and it perform with an Adjusted R square of 0.9494, RMSE of 927.0692.

  • The most inportant variables that we used for the correlation model to predict monthly income were the following : TotalWorkingYears, JobLevel, JobRole.

Depedencies

library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)
library(tidyverse)
library(stringr)
library(caret)
library(fpp2)
library(dygraphs)
library(xts)
library(pander)
library(purrr)
library(ggthemes)
library(gridExtra)
library(cowplot)
library(RColorBrewer)
library(gplots)
library(corrplot) 
library(functional)
library(fastNaiveBayes)
library(ggpubr)

Loading the data

dfdata<- readr::read_csv("data/CaseStudy2-data.csv")
head(dfdata)

Atrittion calculation

dfdata %>% count(Attrition) ->att
att

Exploratory plot for Attrition

You can also embed plots, for example:

agePlot <- ggplot(dfdata,aes(Age,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
agePlot

travelPlot <- ggplot(dfdata,aes(BusinessTravel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
travelPlot

ratePlot <- ggplot(dfdata,aes(DailyRate, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
ratePlot

depPlot <- ggplot(dfdata,aes(Department,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
depPlot

distPlot<- ggplot(dfdata,aes(DistanceFromHome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
distPlot

eduPlot <- ggplot(dfdata,aes(Education,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
eduPlot

edufieldPlot <- ggplot(dfdata,aes(EducationField,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
edufieldPlot

envPlot <- ggplot(dfdata,aes(EnvironmentSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
envPlot

genPlot <- ggplot(dfdata,aes(Gender,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #maybe
genPlot

hourlyPlot <- ggplot(dfdata,aes(HourlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
hourlyPlot

jobInvPlot <- ggplot(dfdata,aes(JobInvolvement,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") #important
jobInvPlot

jobLevelPlot <- ggplot(dfdata,aes(JobLevel,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobLevelPlot

jobSatPlot <- ggplot(dfdata,aes(JobSatisfaction,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
jobSatPlot

overTimePlot <- ggplot(dfdata,aes(OverTime,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium") # important
overTimePlot

hikePlot <- ggplot(dfdata,aes(PercentSalaryHike, fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
hikePlot

perfPlot <- ggplot(dfdata,aes(PerformanceRating,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# maybe
perfPlot

RelSatPlot <- ggplot(dfdata,aes(RelationshipSatisfaction,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
RelSatPlot

StockPlot <- ggplot(dfdata,aes(StockOptionLevel,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
StockPlot

workingYearsPlot <- ggplot(dfdata,aes(TotalWorkingYears,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
workingYearsPlot

TrainTimesPlot <- ggplot(dfdata,aes(TrainingTimesLastYear,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
TrainTimesPlot

 WLBPlot<- ggplot(dfdata,aes(WorkLifeBalance,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")# important
  WLBPlot

marPlot <- ggplot(dfdata,aes(MaritalStatus,fill=Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")#maybe
marPlot

monthlyIncPlot <- ggplot(dfdata,aes(MonthlyIncome,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")# important
monthlyIncPlot

monthlyRatePlot <- ggplot(dfdata,aes(MonthlyRate,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium")
monthlyRatePlot

numCompPlot <- ggplot(dfdata,aes(NumCompaniesWorked,fill=Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
numCompPlot

YearAtComPlot <- ggplot(dfdata,aes(YearsAtCompany,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearAtComPlot

YearInCurrPlot <- ggplot(dfdata,aes(YearsInCurrentRole,fill = Attrition))+geom_bar()+theme_tufte()+scale_fill_few("Medium")
YearInCurrPlot

YearsSinceProm <- ggplot(dfdata,aes(YearsSinceLastPromotion,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") #maybe
YearsSinceProm 

YearsCurrManPlot <- ggplot(dfdata,aes(YearsWithCurrManager,fill = Attrition))+geom_density()+theme_tufte()+scale_fill_few("Medium") # important
YearsCurrManPlot

myplot <- ggplot(dfdata, aes(BusinessTravel, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(Department, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(Education, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(EducationField, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(EnvironmentSatisfaction, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(Gender, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(JobInvolvement, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(JobLevel, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(JobSatisfaction, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(OverTime, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(RelationshipSatisfaction, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(StockOptionLevel, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(WorkLifeBalance, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(MaritalStatus, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(YearsInCurrentRole, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot #maybe

myplot <- ggplot(dfdata, aes(YearsWithCurrManager, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important

myplot <- ggplot(dfdata, aes(YearsAtCompany, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # important/maybe

myplot <- ggplot(dfdata, aes(NumCompaniesWorked, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(TrainingTimesLastYear, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(TotalWorkingYears, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(DistanceFromHome, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot # maybe

myplot <- ggplot(dfdata, aes(Age, group = Attrition)) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~Attrition)

myplot

myplot <- ggplot(dfdata, aes(EducationField, group = JobSatisfaction )) + 
          geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + 
          scale_y_continuous(labels=scales::percent) +
          ylab("relative frequencies") +
          facet_grid(~JobSatisfaction)

myplot

Ecploratory plot for Montly income. This is with numerical values

ys <- names(dfdata)[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36)]
ys %>% map(function(y) 
  ggplot(dfdata , aes(MonthlyIncome)) + geom_point(aes_string(y=y)))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

#### More exploratory data for montly income with heatmap for numerical values

dfn <-dfdata[c(2, 5, 7, 14, 20, 21, 22, 25, 30, 31, 33, 34, 35, 36 )]

my_palette <- colorRampPalette(c("red", "white", "black"))
heatmapper <- function(df){
  df %>%
    keep(is.numeric) %>%
    tidyr::drop_na() %>%
    cor %>%
    heatmap.2(col = my_palette ,
              density.info = "none", trace = "none",
              dendogram = c("both"), symm = F,
              symkey = T, symbreaks = T, scale = "none",
              key = T)
}


heatmapper(dfn)

More exploratory data for montly income with corrplot for numerical values

correlator  <-  function(df){
  df %>%
    keep(is.numeric) %>%
    tidyr::drop_na() %>%
    cor %>%
    corrplot( addCoef.col = "white", number.digits = 2,
              number.cex = 0.5, method="square",
              order="hclust", title="Variable Corr Heatmap",
              tl.srt=45, tl.cex = 0.8)
}

correlator(dfn)

#### More exploratory data for montly income with ggplot numeric for numerical values

plotAllNumeric <- function(df){
  df%>%keep(is.numeric) %>%
    gather() %>%
    ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()+geom_histogram() + theme_fivethirtyeight()
}


plotAllNumeric(dfn)

More exploratory data for montly income for categorical values

dfdata %>% keep(is.factor) %>% names -> label
ggplot(data = dfdata, aes(x = BusinessTravel, y = MonthlyIncome, fill =BusinessTravel )) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = Department, y = MonthlyIncome, fill = Department)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(Education), y = MonthlyIncome, fill = as.factor(Education))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important

ggplot(data = dfdata, aes(x = EducationField, y = MonthlyIncome, fill = EducationField)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(EnvironmentSatisfaction), y = MonthlyIncome, fill = as.factor(EnvironmentSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = Gender, y = MonthlyIncome, fill = Gender)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(JobInvolvement), y = MonthlyIncome, fill = as.factor(JobInvolvement))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(JobLevel), y = MonthlyIncome, fill = as.factor(JobLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # important

ggplot(data = dfdata, aes(x = JobRole, y = MonthlyIncome, fill = JobRole)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()# important

ggplot(data = dfdata, aes(x = as.factor(JobSatisfaction), y = MonthlyIncome, fill = as.factor(JobSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = MaritalStatus, y = MonthlyIncome, fill = MaritalStatus)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = OverTime, y = MonthlyIncome, fill = OverTime)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(PerformanceRating), y = MonthlyIncome, fill = as.factor(PerformanceRating))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() #maybe

ggplot(data = dfdata, aes(x = as.factor(RelationshipSatisfaction), y = MonthlyIncome, fill = as.factor(RelationshipSatisfaction))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

ggplot(data = dfdata, aes(x = as.factor(StockOptionLevel), y = MonthlyIncome, fill = as.factor(StockOptionLevel))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() # maybe

ggplot(data = dfdata, aes(x = as.factor(WorkLifeBalance), y = MonthlyIncome, fill =as.factor(WorkLifeBalance))) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

Cutting the variables for the dataframe that will be used in the model, witht the intention to have better performance of the model.

dfdata$YearsInCurrentRole<- cut(as.numeric(dfdata$YearsInCurrentRole), breaks = c(-1,1,100))
dfdata$TotalWorkingYears<- cut(as.numeric(dfdata$TotalWorkingYears), breaks = c(-1,1,100))
dfdata$YearsWithCurrManager<- cut(as.numeric(dfdata$YearsWithCurrManager), breaks = c(-1,1,100))
dfdata %>% filter(MaritalStatus %in% c("Single", "Divorce"))-> MaritalStatus
dfdata$MaritalStatus <- as.factor(dfdata$MaritalStatus)
levels(dfdata$MaritalStatus) <- c("NotMarried", "Married", "NotMarried")

Setting the dataframe for the model. After a lot of trial, this was the best model for attrition, we used Naive bayes model.

clasy<- dfdata[c("Attrition", "StockOptionLevel", "JobLevel", "MonthlyIncome", "OverTime")]

clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")] <- lapply(clasy[c("Attrition","StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(clasy)

Split the train and test data

set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
  train_ind <- createDataPartition(df[[1]], p = p, list = list)
  cat("creating training dataset...\n")
  training <<- df[train_ind, ]
  cat("completed training dataset, creating test set\n")
  test <<- df[-train_ind, ]
  cat("done")
}

split(clasy)
## creating training dataset...
## completed training dataset, creating test set
## done

We run the Naive bayes model

library(doParallel)
numcores <- parallel::detectCores() - 1
cl <- makePSOCKcluster(numcores)
registerDoParallel(cl)


set.seed(3333)
trainMethod <- trainControl( method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

fit.nb1 <- train(Attrition ~ .,  data = training, method = "nb", metric = "Spec", trControl = trainMethod)

fit.nb1
## Naive Bayes 
## 
## 653 samples
##   4 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times) 
## Summary of sample sizes: 627, 626, 627, 628, 626, 626, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  ROC        Sens       Spec  
##   FALSE      0.7232026  0.7445195  0.6584
##    TRUE      0.7668199  0.9992727  0.0000
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Spec was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE
##  and adjust = 1.
plot(fit.nb1)

Sensitivity : 0.7473

Specificity : 0.6857

Accuracy : 0.7373

test_pred <- predict(fit.nb1, newdata = test)
test_pred
##   [1] No  Yes No  Yes Yes No  No  No  No  Yes No  No  No  No  No  No  Yes
##  [18] No  Yes No  No  Yes Yes Yes Yes No  No  No  No  No  No  No  Yes No 
##  [35] No  No  Yes No  No  No  No  No  No  No  No  No  No  No  No  No  No 
##  [52] No  No  No  No  Yes No  No  Yes No  No  No  Yes No  No  No  Yes No 
##  [69] No  Yes No  No  No  No  Yes No  Yes Yes No  Yes No  No  No  Yes No 
##  [86] Yes No  Yes No  Yes No  No  No  Yes No  Yes No  No  No  No  Yes No 
## [103] Yes No  Yes No  No  No  Yes No  No  Yes No  No  Yes No  No  No  No 
## [120] No  Yes Yes Yes No  No  No  No  Yes No  No  Yes Yes Yes No  Yes Yes
## [137] Yes No  No  No  No  No  No  Yes No  No  No  No  Yes Yes Yes Yes No 
## [154] No  Yes No  Yes No  No  Yes No  No  No  No  No  No  No  Yes No  No 
## [171] Yes Yes No  Yes No  No  No  No  Yes Yes Yes No  Yes No  No  Yes Yes
## [188] Yes Yes No  No  No  No  No  No  Yes Yes No  No  Yes No  Yes No  Yes
## [205] No  No  Yes No  No  No  Yes No  No  No  No  No  No 
## Levels: No Yes
confusionMatrix(test_pred, test$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  136  11
##        Yes  46  24
##                                           
##                Accuracy : 0.7373          
##                  95% CI : (0.6735, 0.7946)
##     No Information Rate : 0.8387          
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.3084          
##                                           
##  Mcnemar's Test P-Value : 6.687e-06       
##                                           
##             Sensitivity : 0.7473          
##             Specificity : 0.6857          
##          Pos Pred Value : 0.9252          
##          Neg Pred Value : 0.3429          
##              Prevalence : 0.8387          
##          Detection Rate : 0.6267          
##    Detection Prevalence : 0.6774          
##       Balanced Accuracy : 0.7165          
##                                           
##        'Positive' Class : No              
## 
stopCluster(cl)

Setting the dataframe for monthly income, we will use regression.

dfdata<- readr::read_csv("data/CaseStudy2-data.csv")
reg <- dfdata[c("TotalWorkingYears", "YearsAtCompany", "Age", "Education", "JobLevel", "JobRole", "MonthlyIncome", "PerformanceRating", "StockOptionLevel" )]

reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(reg[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)

reg

Train and test split

set.seed(3033)
split <- function(df, p = 0.75, list = FALSE, ...) {
  train_ind <- createDataPartition(df[[1]], p = p, list = list)
  cat("creating training dataset...\n")
  training <<- df[train_ind, ]
  cat("completed training dataset, creating test set\n")
  test <<- df[-train_ind, ]
  cat("done")
}

split(reg)
## creating training dataset...
## completed training dataset, creating test set
## done

After alot of trial and error and review of the visualization above we picked up the best model, which is the one below.

RMSE = 927.0692

regincome1 <- lm( MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole, data  = training)
summary(regincome1)
## 
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobLevel + JobRole, 
##     data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3085.0  -647.5   -97.3   638.3  4284.1 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3642.521    220.489  16.520  < 2e-16 ***
## TotalWorkingYears                53.411      9.134   5.847 7.96e-09 ***
## JobLevel2                      1598.495    170.893   9.354  < 2e-16 ***
## JobLevel3                      4650.058    226.324  20.546  < 2e-16 ***
## JobLevel4                      7872.592    345.320  22.798  < 2e-16 ***
## JobLevel5                     10669.268    394.495  27.045  < 2e-16 ***
## JobRoleHuman Resources        -1242.551    320.507  -3.877 0.000117 ***
## JobRoleLaboratory Technician  -1309.554    212.884  -6.151 1.35e-09 ***
## JobRoleManager                 3557.429    293.615  12.116  < 2e-16 ***
## JobRoleManufacturing Director   239.931    187.576   1.279 0.201319    
## JobRoleResearch Director       3701.321    252.276  14.672  < 2e-16 ***
## JobRoleResearch Scientist     -1091.211    216.239  -5.046 5.87e-07 ***
## JobRoleSales Executive            8.403    162.191   0.052 0.958697    
## JobRoleSales Representative   -1399.737    261.799  -5.347 1.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1036 on 641 degrees of freedom
## Multiple R-squared:  0.9504, Adjusted R-squared:  0.9494 
## F-statistic:   944 on 13 and 641 DF,  p-value: < 2.2e-16
pred <- predict(regincome1, newdata = test)
str(pred)
##  Named num [1:215] 9067 9049 2872 9013 5668 ...
##  - attr(*, "names")= chr [1:215] "1" "2" "3" "4" ...
ASA2 <- mean((pred[1:nrow(test)] - test$MonthlyIncome)^2)
sqrt(ASA2)
## [1] 927.0692

AIC and BIC

AIC(regincome1)
## [1] 10970.58
BIC(regincome1)
## [1] 11037.85

We run our model against the test data

pred2 <- predict(regincome1, newdata = test)
pred2
##         1         2         3         4         5         6         7 
##  9066.617  9048.732  2871.775  9013.207  5668.301  2560.202  2934.077 
##         8         9        10        11        12        13        14 
##  5730.115  5908.232  5854.821  5775.123  5783.526  2867.074  9102.143 
##        15        16        17        18        19        20        21 
##  6103.990 19792.003  8781.679  6068.464  2867.074  2403.017  8835.089 
##        22        23        24        25        26        27        28 
##  8835.089  9529.428  2386.378  6015.054 13357.364  6317.632  5775.123 
##        29        30        31        32        33        34        35 
## 16300.988  6148.998  8995.321  5569.883  2818.364  2764.953  6095.587 
##        36        37        38        39        40        41        42 
##  4465.569  6015.054  5218.019 12858.783  4412.158  2760.253 13517.596 
##        43        44        45        46        47        48        49 
##  5801.411  2386.378  4683.912 16658.522  2658.132  8781.679  9013.207 
##        50        51        52        53        54        55        56 
## 19508.610  2867.074  9574.436  2386.378 12698.551  2978.596 18990.843 
##        57        58        59        60        61        62        63 
##  2818.364  2604.721 13117.433  6015.054 12858.783  6103.990 19151.075 
##        64        65        66        67        68        69        70 
## 13008.704  2563.249  9120.028  5890.347  2604.721 12848.472  5676.704 
##        71        72        73        74        75        76        77 
##  2653.431 16071.005  2386.378  5783.526  4321.975  5836.936  2658.132 
##        78        79        80        81        82        83        84 
##  2813.664  5569.883  4732.622  4518.980 12741.650  2600.021  2871.775 
##        85        86        87        88        89        90        91 
##  6015.054  5454.659  5164.608 16300.988  9048.732 19471.539  5676.704 
##        92        93        94        95        96        97        98 
## 12645.140  5801.411  5004.376  5668.301  9529.428  6050.579  2818.364 
##        99       100       101       102       103       104       105 
## 19188.146  4732.622  2764.953  4683.912 16925.576  5997.168  2706.842 
##       106       107       108       109       110       111       112 
## 16728.273  8835.089  2973.896  4950.965  2546.610  5775.123  2827.256 
##       113       114       115       116       117       118       119 
##  2871.775  5783.526  4465.569  2871.775  5676.704  2978.596  2296.195 
##       120       121       122       123       124       125       126 
##  8986.918  2403.017  2706.842  2600.021  2925.185  5561.480  2871.775 
##       127       128       129       130       131       132       133 
##  9360.793  2604.721  6228.696  3085.417  8531.936  2720.434  4532.572 
##       134       135       136       137       138       139       140 
##  2546.610  2706.842  2871.775  5783.526  5997.168  2604.721  5569.883 
##       141       142       143       144       145       146       147 
##  2706.842  5561.480  5057.787  2776.891  2349.606 16925.576  5730.115 
##       148       149       150       151       152       153       154 
##  8773.276  7530.725  2760.253 12634.829  9760.956  9048.732  6264.222 
##       155       156       157       158       159       160       161 
##  2439.789  8781.679  2818.364 12651.169  5801.411  4465.569  8995.321 
##       162       163       164       165       166       167       168 
##  2493.199  4465.569 17139.219  5569.883  3085.417  2818.364  6442.339 
##       169       170       171       172       173       174       175 
##  2925.185  4532.572  2711.543  2403.017  4897.555  4683.912  9066.617 
##       176       177       178       179       180       181       182 
##  5961.643  2760.253  2818.364  2386.378  2453.381 12384.115  5988.765 
##       183       184       185       186       187       188       189 
##  2764.953  2349.606 16461.220  6264.222  4465.569  3085.417  6050.579 
##       190       191       192       193       194       195       196 
##  9360.793  2764.953  2773.845  5463.061 15697.130 12651.169 19829.074 
##       197       198       199       200       201       202       203 
##  3032.007 16194.166  4839.444  9093.740  5569.883  2658.132  2706.842 
##       204       205       206       207       208       209       210 
##  2439.789  2813.664 19578.360  2332.967  5569.883  4465.569  2386.378 
##       211       212       213       214       215 
##  5463.061  2867.074  5694.589 13019.015  5836.936

Setting the competition “salary data” with our regression model created above to predict monthly income.

noinc <- read_csv("data/CaseStudy2CompSet No Salary (2).csv")
noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")] <- lapply(noinc[c("Education","JobLevel", "JobRole","PerformanceRating", "StockOptionLevel")], as.factor)
noinc
preds <-predict(regincome1, newdata =  noinc)
preddf <- data.frame(predicted = preds, ID = noinc$ID)
preddf <- preddf %>% arrange(ID)
write.csv(preddf, "Case2PredictionsSavorgnanSalary.csv")

Setting the competition “no attrition”" data with our model created above to predict attrition

library(readr)
noatt <- read_csv("data/CaseStudy2CompSet No Attrition.csv")
noatt[c("StockOptionLevel", "JobLevel", "OverTime")] <- lapply(noatt[c("StockOptionLevel", "JobLevel", "OverTime")], as.factor)
head(noatt)
test_preda <- predict(fit.nb1, newdata = noatt)
test_preda
##   [1] No  No  No  No  No  No  No  Yes No  No  Yes Yes No  No  Yes No  No 
##  [18] No  No  No  Yes No  Yes No  No  No  No  Yes No  Yes Yes No  No  No 
##  [35] Yes No  No  No  No  No  Yes Yes No  No  Yes No  No  No  Yes No  No 
##  [52] Yes No  No  No  No  Yes No  No  Yes No  No  Yes No  No  No  No  No 
##  [69] No  Yes No  No  No  No  No  Yes No  Yes Yes No  No  No  No  No  No 
##  [86] No  No  No  No  No  No  No  Yes No  No  No  No  No  Yes Yes No  No 
## [103] Yes No  No  No  No  No  Yes No  No  No  No  No  No  No  Yes Yes Yes
## [120] No  No  Yes No  No  No  No  Yes No  No  No  No  No  No  No  No  No 
## [137] No  No  Yes No  No  No  Yes Yes Yes No  No  Yes Yes No  No  No  No 
## [154] No  Yes No  Yes Yes Yes No  No  No  Yes No  No  Yes No  No  Yes No 
## [171] Yes Yes No  Yes No  No  No  No  No  No  No  No  No  No  No  No  No 
## [188] Yes No  No  Yes No  No  No  Yes Yes Yes No  No  No  No  No  Yes No 
## [205] No  Yes No  No  No  No  Yes No  Yes No  No  Yes No  No  No  No  Yes
## [222] No  No  No  No  Yes Yes No  Yes No  Yes Yes No  Yes Yes Yes Yes No 
## [239] No  No  Yes Yes No  Yes No  No  Yes Yes No  Yes Yes No  No  Yes No 
## [256] No  No  Yes Yes No  No  No  No  No  No  No  No  No  Yes No  No  No 
## [273] No  Yes Yes Yes No  No  No  Yes No  No  No  No  No  Yes Yes Yes Yes
## [290] Yes No  No  No  No  Yes Yes No  Yes No  No 
## Levels: No Yes
predi <- data.frame(predicted = test_preda, ID = noatt$ID)
predited <- predi %>% arrange(ID)
write.csv(predited,"Case2PredictionsSavorgnanAttrition.csv")